# --- Environment setup (Google Colab notebook export) ---
!pip install scipy==1.8.1
!pip install clean-text
# Prompt the user to upload the keyword CSV into the Colab runtime.
from google.colab import files
file_data = files.upload()
import pandas as pd
import numpy as np
import networkx as nx
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import warnings
# Silence pandas/seaborn warnings in notebook output.
warnings.filterwarnings('ignore')
# Load the uploaded keyword dataset (one title column + keyword columns).
df_keyword = pd.read_csv('Keyword_data - Keyword_data.csv')
df_keyword.shape
# Clean the dataset: drop rows in which every column is null
# Drop rows where every column is NaN (blank spreadsheet rows).
df_keyword.dropna(how = "all", inplace=True)
# BUG FIX: reset_index returns a new frame; the original discarded the
# result, leaving gaps in the index after the row drop above.
df_keyword = df_keyword.reset_index(drop=True)
df_keyword
# Keep only the keyword columns (everything after the 'Title' column).
keywords_df = df_keyword.iloc[:,1:]
keywords_df
# Find the unique keywords in the dataframe and build a list of them
# Flatten every keyword column into one 1-D collection, then deduplicate.
# (The original used DataFrame.apply purely for its side effect of
# appending column arrays to a list — an anti-pattern; ravel the values
# directly instead.)
list_kw = list(set(keywords_df.to_numpy().ravel()))
len(list_kw)
# Remove the NaN placeholders that pad the ragged keyword rows.
unique_val = [element for element in list_kw if str(element) != "nan"]
print("Output list is:", unique_val)
#convert the dataframe into dict values by considering first column as keys
mapped_list = df_keyword.set_index('Title').agg(list, axis=1).to_dict()
mapped_list
#length of the unique keyword list == adjacency matrix dimension
len_keys = len(unique_val)

def _build_adjacency(labels, title_keywords):
    """Return the symmetric keyword co-occurrence matrix.

    Entry (i, j) counts how many titles' keyword lists contain both
    labels[i] and labels[j]; the diagonal stays zero.
    """
    n = len(labels)
    matrix = np.zeros((n, n), dtype=int)
    # Visit each unordered pair exactly once (upper triangle) and mirror
    # the count, instead of scanning the full n*n grid guarded by a
    # "both entries still zero" check as the original did.
    for row in range(n):
        for col in range(row + 1, n):
            weight = sum(
                1
                for keywords in title_keywords.values()
                if labels[row] in keywords and labels[col] in keywords
            )
            matrix[row][col] = weight
            matrix[col][row] = weight
    return matrix

adj_matrix = _build_adjacency(unique_val, mapped_list)
adj_matrix
#plotting network graph with the unique keywords and adjacency matrix
# from_numpy_matrix was deprecated and removed in networkx 3.0;
# from_numpy_array is the drop-in replacement with identical semantics.
network = nx.from_numpy_array(adj_matrix, parallel_edges=False)
plt.figure(3, figsize=(10, 6))
nx.draw(network, node_color='#6b5b95', node_size=400, font_color='whitesmoke')
plt.show()
#3,4,5 network summary: node/edge counts and per-node degree
print("No of Nodes in the Network ",network.number_of_nodes() )
print("No of Edges in the Network ",network.number_of_edges() )
# Tabulate (node, degree) pairs and attach the keyword each node stands for
# (node ids follow the row order of adj_matrix, i.e. unique_val's order).
degree_pairs = list(network.degree)
df_net_degree = pd.DataFrame(degree_pairs, columns=['No_Of_Nodes', 'No_Of_Degree'])
df_net_degree['Keywords'] = unique_val
df_final_degree = df_net_degree.sort_values('No_Of_Degree', ascending=False)
# Ten best-connected keywords.
df_final_degree.head(10)
#3,4,5 compute the node strength (degree weighted by edge weight)
strength_pairs = list(network.degree(weight='weight'))
df_net_strength = pd.DataFrame(strength_pairs, columns=['No_Of_Nodes', 'Strength'])
df_net_strength['Keywords'] = unique_val
df_final_strength = df_net_strength.sort_values('Strength', ascending=False)
# Ten keywords with the highest weighted degree.
df_final_strength.head(10)
#6 compute all pairs of weight
# Cartesian product of keywords, each paired with the matrix entry linking them.
lis = [
    [df_net_strength['Keywords'][i], df_net_strength['Keywords'][j], adj_matrix[i][j]]
    for i in range(len(unique_val))
    for j in range(len(unique_val))
]
df_net_weights = pd.DataFrame(lis, columns =['keyword1', 'keyword2', 'Weights'], dtype = float)
df_final_weights = df_net_weights.sort_values('Weights', ascending=False)
#top 10 values from the dataframe
df_final_weights.head(10)
#7. Plot average strength on y-axis and degree on x-axis
# NOTE(review): this plots each node's raw strength against its degree,
# not a per-degree average as the comment above implies — confirm intent.
sns.lineplot(x=df_net_degree['No_Of_Degree'], y=df_net_strength['Strength'])
#using plotly library
fig = px.line(x=df_net_degree['No_Of_Degree'], y=df_net_strength['Strength'], title='Strength vs Degree')
fig.show()
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cleantext import clean
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Base English stop words from NLTK.
STOPWORDS = list(set(stopwords.words('english')))
# Punctuation / filler tokens to strip in addition to NLTK's list.
# BUG FIX: the original list contained `':' ' ('` (missing comma), which
# Python's implicit string concatenation silently turned into the single
# token ": (" — so neither ':' nor ' (' was actually filtered.
extra_stopwords = ['@','$','.',',','!',"'s",'i','I','&','%','*','"','#','(',')','[',']','{','}','/','?','<','>','`','~','-','_','+','=',' & ',' &','& ','the','3','it','its',"it's",'this','that.','that','...','we','It','yes','no','if','we','If','We',';',':',' (',')','[',']','{','}','m','t','d']
stop_words = STOPWORDS + extra_stopwords
print(stop_words)
#Mounting files from google drive
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/FDA_Project_3/tweet_datasets
#Changing directory for files and converting the file names to list
dir = '/content/drive/MyDrive/FDA_Project_3/tweet_datasets'
#It gives us the list of all the files present in the current directory
files_dir_list = sorted(os.listdir(dir))
print(pd.DataFrame(files_dir_list,columns=['File_Names']))
#getting the all years data into one dataframe
df_tweets_tempdata = pd.concat((pd.read_csv(f) for f in files_dir_list), ignore_index=True)
df_tweets_tempdata.head(10)
# Keep only the columns used downstream. BUG FIX: take an explicit .copy()
# so the column assignments below modify an independent frame rather than
# a view of df_tweets_tempdata (SettingWithCopy hazard — silent here only
# because warnings are globally suppressed).
df_tweets_data = df_tweets_tempdata[['id','created_at','date','tweet','language','username','nlikes','nreplies','nretweets']].copy()
df_tweets_data['date'] = pd.to_datetime(df_tweets_data['date'])
# Derive a plain integer year column for filtering/grouping.
df_tweets_data['year'] = df_tweets_data['date'].dt.year
df_tweets_data.dtypes
df_tweets_data['tweet']
#Consider the data only from year 2017 to 2022
list_of_years = [2017,2018,2019,2020,2021,2022]
# Keep only rows whose year falls inside the study window.
year_mask = df_tweets_data['year'].isin(list_of_years)
df_tweets_data = df_tweets_data[year_mask]
print("Tweets count per year: ")
df_tweets_data.year.value_counts()
#removing stopwords and emojis from the tweets
# Membership tests against a set are O(1) vs O(n) for the original list —
# this runs once per token across every tweet, so the set matters.
stop_lookup = set(stop_words)
df_tweets_data['tweet_without_stopwords'] = df_tweets_data['tweet'].apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop_lookup)
)
# Strip emojis. Series.apply is equivalent to the original row-wise
# DataFrame.apply(axis=1) but avoids building a row object per tweet.
df_tweets_data['tweet_without_stopwords'] = df_tweets_data['tweet_without_stopwords'].apply(
    lambda text: clean(text, no_emoji=True)
)
df_tweets_data.drop(columns=['id','created_at','date'],axis=1,inplace=True)
df_tweets_data = df_tweets_data.reset_index(drop = True)
df_tweets_data
#1. Compute word frequencies for each year. Exclude the stop words
years = [2017,2018,2019,2020,2021,2022]
# Strip leftover symbols, filler words, and URLs in one chained pass.
# BUG FIX: the original used plain substring replacement, which mangles
# unrelated words (e.g. removing "the" turns "weather" into "wear";
# removing "this" guts "thistle"). Word-boundary regexes remove only
# whole tokens.
df_tweets_data['tweet_without_stopwords'] = (
    df_tweets_data['tweet_without_stopwords']
    .str.replace('&', '', regex=False)
    .str.replace('@', '', regex=False)
    .str.replace(r"\b(?:the|it's|this|will|would|yes|that's)\b", '', regex=True)
    .replace(r'http\S+', '', regex=True)
    .replace(r'www\S+', '', regex=True)
)
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    # One row per token occurrence, then count duplicates (sorted desc).
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    print('Word frequnecy per year: ', year)
    print(df_word_freq_year.sort_values('count', ascending=False))
#2. Show top 10 words (for each year) by the highest value of word frequency
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    # Token frequency table, already sorted descending by value_counts.
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    print('Top 10 word frequnecy per year: ', year)
    print(df_word_freq_year.head(10))
    print('\n')
    # Bar plot of the 20 most frequent words for this year.
    sns.set(rc={"figure.figsize":(23, 8)})
    plt.title(f'Top Words and frequencies for the year {year}', size = 16)
    sns.barplot(x = 'words', y = 'count', data = df_word_freq_year.head(20))
    plt.show()
    print('\n')
#3. Plot Histogram of word frequencies
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    print('\n')
    print('Ploting histogram of word frequnecy per year: ', year)
    # 25-bin histogram of how many words occur at each frequency.
    fig = sns.histplot(df_word_freq_year, x ='count', bins =25)
    plt.show()
    print('\n')
#4. Method-1 Use Zipf's law and plot log-log plots of word frequencies and rank for each year
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    # value_counts sorts descending, so the row order is already the rank.
    df_word_freq_year['rank'] = range(1, len(df_word_freq_year) + 1)
    print('\n')
    print('Ploting log-log plots of word frequencies and rank for each year: ', year)
    plt.loglog(df_word_freq_year['count'], df_word_freq_year['rank'])
    plt.xlabel('Word Frequency')
    plt.ylabel('Rank')
    plt.show()
#4. Method-2 Use Zipf's law and plot log-log plots of word frequencies and rank for each year
import scipy.stats as ss
import math
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    counts = df_word_freq_year['count'].values.tolist()
    # rankdata averages ranks for ties; then move both axes to log space.
    rank_word = ss.rankdata(counts)
    frequencies = [math.log(c) for c in counts]
    rank = [math.log(r) for r in rank_word]
    fig, ax = plt.subplots(figsize = (20,6))
    plt.plot(frequencies, rank, 'bo')
    print('\n')
    plt.title(f"Using zipf's law Log-log plot of word frequencies and rank for the year {year}", size = 16)
    plt.xlabel('Word Frequency', size = 14)
    plt.ylabel('Word Rank', size = 14)
    plt.show()
    print('\n')
#5. Method-1 Create bigram network graphs for each year
import nltk
from nltk import bigrams
import itertools
import collections
# Tokenize the cleaned 'tweet_without_stopwords' column of df_tweets_data
from nltk.tokenize import RegexpTokenizer
def tokenize_text(text):
    """Split *text* into word tokens (maximal runs of \\w characters).

    Equivalent to nltk's ``RegexpTokenizer(r'\\w+').tokenize(text)`` but
    uses the stdlib ``re`` module directly, avoiding the construction of
    a new tokenizer object on every call.
    """
    return re.findall(r'\w+', text)
for year in years:
    # BUG FIX: the original filtered on the literal 2017 inside the loop,
    # so every iteration plotted the 2017 network; filter on `year`.
    # .copy() so the 'tokens' assignment below doesn't write into a view.
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year].copy()
    df_tweets_data_year['tokens'] = df_tweets_data_year['tweet_without_stopwords'].apply(tokenize_text)
    df_tweets_data_year['tokens']
    # Bigrams per tweet, flattened into one list, then counted.
    bigrams_list = [list(nltk.bigrams(tokens)) for tokens in df_tweets_data_year['tokens']]
    bigrams = list(itertools.chain(*bigrams_list))
    bigram_counts = collections.Counter(bigrams)
    bigram_counts.most_common(50)
    bigram_df = pd.DataFrame(bigram_counts.most_common(50),
                             columns=['bigram', 'count'])
    # Single dict of {(word1, word2): count} for the top-50 bigrams.
    d = bigram_df.set_index('bigram').T.to_dict('records')
    print(f"Bigram network graphs for each year {year}")
    print(bigram_df.head(15))
    # Create network plot: one edge per bigram, weighted by its count.
    G = nx.Graph()
    for k, v in d[0].items():
        G.add_edge(k[0], k[1], weight=(v * 10))
    G.add_node("tesla", weight=100)
    fig, ax = plt.subplots(figsize=(20, 10))
    pos = nx.spring_layout(G, k=2)
    nx.draw_networkx(G, pos,
                     font_size=16,
                     width=3,
                     edge_color='grey',
                     node_color='purple',
                     with_labels=False,
                     ax=ax)
    # Offset each label slightly so it doesn't sit on top of its node.
    for key, value in pos.items():
        x, y = value[0] + .135, value[1] + .045
        ax.text(x, y,
                s=key,
                bbox=dict(facecolor='red', alpha=0.25),
                horizontalalignment='center', fontsize=13)
    plt.show()
    print('\n')
#5. Method-2 Create bigram network graphs for each year
#To keep execution time down, only the first 200 bigrams are plotted
for year in years:
    df_tweets_data_year = df_tweets_data[df_tweets_data['year'] == year]
    token_counts = df_tweets_data_year['tweet_without_stopwords'].str.split(expand=True).stack().value_counts()
    df_word_freq_year = token_counts.to_frame().reset_index()
    df_word_freq_year.columns = ['words', 'count']
    # Consecutive pairs taken from the frequency-ordered word list.
    bi_words = list(nltk.bigrams(df_word_freq_year.words))
    bi_analysis = nltk.FreqDist(bi_words[:200])
    print(f"Bigram network graphs for each year {year}")
    print('\n')
    Net_Graph = nx.Graph()
    for pair, freq in bi_analysis.most_common():
        Net_Graph.add_weighted_edges_from([(pair[0], pair[1], freq)])
    plt.figure(figsize=(20,10))
    draw_options = {
        'edge_color': '#987654',
        'width': 1.5,
        'with_labels': True,
        'font_weight': 'bold',
    }
    nx.draw(Net_Graph, pos=nx.spring_layout(Net_Graph, k=0.25, iterations=10), **draw_options)
    axes = plt.gca()
    axes.collections[0].set_edgecolor("#987654")
    plt.show()
    print('\n')
#Additional Analysis of tweets: top-liked tweets per year
def _plot_top_likes(year, top_n=20):
    """Bar chart of the `top_n` most-liked distinct tweets for `year`.

    Re-derives the cleaned frame (rows missing any engagement metric
    dropped, duplicate tweet texts removed) on each call, exactly as the
    original per-year copy-pasted cells did.
    """
    # Drop rows missing likes/replies/retweets; copy so inplace ops below
    # don't touch df_tweets_data.
    cleaned = df_tweets_data.dropna(subset=['nlikes','nreplies','nretweets']).copy()
    cleaned.drop(columns=['language','username','tweet_without_stopwords'], inplace=True)
    cleaned.drop_duplicates(subset=['tweet'], inplace=True)
    yearly = cleaned[cleaned['year'] == year]
    top_df = yearly.sort_values(by=['nlikes'], ascending=False).iloc[:top_n]
    print('\n')
    fig = px.bar(top_df, x='tweet', y='nlikes',
                 title=f"Top likes for tweets in each year {year}")
    # Abbreviate x tick labels to the last 30 characters of each tweet.
    fig.update_layout(
        xaxis = {
            'tickmode': 'array',
            'tickvals': list(range(len(top_df))),
            'ticktext': top_df['tweet'].str.slice(-30).tolist(),
        }
    )
    fig.show()

# The original repeated the cell above once per year with a hard-coded
# year value; a single loop removes the copy-paste duplication.
for _year in [2017, 2018, 2019, 2020]:
    _plot_top_likes(_year)
# Return to the Colab root and export this notebook to HTML.
!pwd
%cd /content
!pwd
!jupyter nbconvert --to html 'Project_3_group_16.ipynb'